Calendar Heatmaps

Source code can be obtained here

We’ll use an attacker data with 200,000 “events”. It has the timestamp, and the source_country ISO 3166-1 alpha-2 country code (which is the source of the attack) plus the tz time zone of the source IP address. Let’s have a look:

library(DT)
library(dplyr)
library(scales)
library(viridis)
library(ggplot2)
library(ggthemes)
library(gridExtra)
library(lubridate)
library(data.table)
setwd('/Users/ethen/Business-Analytics/articles/calendar_heatmaps')

attacks <- fread("data/eventlog.csv")
DT::datatable(attacks)

A quick exploratory data analysis on this attacker data is what were the “working hours” by country, and we can visualize this with a calendar heatmap. To create the heatmap, we’ll need the weekday and hour of each event, or as granular as you want to get.

make_hr_wkday <- function( ts, sc, tz ){
    # convert each time with the appropriate timezone,
    # the timezone parameter, tz, only takes a single value,
    # then extract its weekdays and hour
    real_times <- ymd_hms( ts, tz = tz[1], quiet = TRUE )
    dt <- data.table( source_country = sc,
                      wkday = weekdays(real_times),
                      hour = hour(real_times) )
    return(dt)
}

# simply stores the name of the weekday
# convert weekday and hour into factor so they'll be ordered
# when plotting
weekday_levels <- levels( weekdays( 0, abbreviate = FALSE ) )
attacks <- attacks %>% 
           group_by(tz) %>%
           do( make_hr_wkday( .$timestamp, .$source_country, .$tz ) ) %>% 
           ungroup() %>% 
           mutate( wkday = factor( wkday, levels = weekday_levels ),
                   hour  = factor( hour, levels = 0:23 ) )
head(attacks)
## Source: local data frame [6 x 4]
## 
##             tz source_country    wkday   hour
##          <chr>          <chr>   <fctr> <fctr>
## 1 Africa/Cairo             BG Saturday     22
## 2 Africa/Cairo             TW   Sunday      8
## 3 Africa/Cairo             TW   Sunday     10
## 4 Africa/Cairo             CN   Sunday     13
## 5 Africa/Cairo             US   Sunday     17
## 6 Africa/Cairo             CA   Monday     13

Then we can simply group the count by hour and wkday and plot it, since we know that we have values for every combination there’s no need to further preprocess the data.

grouped <- attacks %>% count( wkday, hour ) %>% ungroup()

ggplot( grouped, aes( hour, wkday, fill = n ) ) + 
geom_tile( color = "white", size = 0.1 ) + 
theme_tufte( base_family = "Helvetica" ) + 
coord_equal() + 
scale_fill_viridis( name = "# of Events", label = comma ) + 
labs( x = NULL, y = NULL, title = "Events per weekday & time of day" ) +
theme( axis.ticks = element_blank(),
       plot.title = element_text( hjust = 0.5 ),
       legend.title = element_text( size = 8 ),
       legend.text = element_text( size = 6 ) )

Some ggplot2 explanation:

That’s great, but what if we wanted the heatmap breakdown by country? We’ll can do this in two ways, first with each country’s heatmap using the same scale (using facet), then with each one using it’s own scale (using grid.arrange).

events_by_country <- count( attacks, source_country ) %>% 
                     mutate( percent = percent( n / sum(n) ) ) %>%
                     arrange( desc(n) )
head(events_by_country)
## Source: local data frame [6 x 3]
## 
##   source_country     n percent
##            <chr> <int>   <chr>
## 1             CN 85243   42.6%
## 2             US 48684   24.3%
## 3             KR 12648    6.3%
## 4             NL  8572    4.3%
## 5             VN  6340    3.2%
## 6             TW  3469    1.7%
top_country <- events_by_country$source_country[1:4]
top_country_attacks <- attacks %>%
                       filter( source_country %in% top_country ) %>% 
                       count( source_country, wkday, hour ) %>% 
                       ungroup() %>% 
                       mutate( source_country = factor( source_country, levels = top_country ) )


gg <- ggplot( top_country_attacks, aes( x = hour, y = wkday, fill = n ) ) + 
      geom_tile( color = "white", size = 0.1 ) +
      scale_fill_viridis( name = "# Events" ) + 
      coord_equal() + 
      facet_wrap( ~source_country, ncol = 2 ) +
      labs( x = NULL, y = NULL, title = "Events per weekday & time of day by country\n" ) + 
      theme_tufte( base_family = "Helvetica" ) + 
      theme( axis.ticks = element_blank(),
             axis.text = element_text( size = 8 ),
             panel.border = element_blank(),
             plot.title = element_text( hjust = 0.5 ),
             strip.text = element_text( hjust = 0.5 ),
             panel.margin = unit( 0.1, "cm" ),
             legend.position = "bottom",
             legend.title = element_text( size = 8 ),
             legend.text = element_text( size = 6 ),
             legend.key.size = unit( 0.4, "cm" ),
             legend.key.width = unit( 1, "cm" ) )
gg

plots <- lapply( top_country, function(x){

    subset_data <- top_country_attacks %>% filter( source_country == x )
    gg <- ggplot( subset_data, aes( x = hour, y = wkday, fill = n ) ) + 
          geom_tile( color = "white", size = 0.1 ) +
          scale_fill_viridis( name = "# Events" ) + 
          scale_y_discrete( expand = c( 0, 0 ) ) +
          coord_equal() + 
          labs( x = NULL, y = NULL, title = x ) + 
          theme_tufte( base_family = "Helvetica" ) + 
          theme( axis.ticks = element_blank(),
                 axis.text = element_text( size = 7 ),
                 panel.border = element_blank(),
                 plot.title = element_text( hjust = 0.5 ),
                 strip.text = element_text( hjust = 0.5 ),
                 panel.margin = unit( 0.1, "cm" ),
                 legend.position = "bottom",
                 legend.title = element_text( size = 6 ),
                 legend.text = element_text( size = 6 ),
                 legend.key.size = unit( 0.4, "cm" ) )
    return(gg)
})

# specify the additional arguments to grid.arrange
# by adding it to the list that's going to be do.called
plots[['ncol']] = 2
do.call( grid.arrange, plots )

Ming-Yu Liu

2016-07-09